Updated: Sep 20, 2023 at 22:00:52 PDT.
Working directory: /home/groups/jgrimmer/trump_blocs.
Reads:
Writes:
rm(list = ls())
library(dplyr)
library(data.table)
census <- fread("data/ipums-census/usa_00006.csv")
census[ ,
`:=`(
year = as.character(YEAR),
statefips = coler::lz_pad(STATEFIP, 2),
weight = PERWT / 100,
urban = recode(METRO, "1" = "rural", "2" = "urban", "3" = "suburban",
.default = NA_character_),
gender = recode(SEX, `1` = "male", `2` = "female",
.default = NA_character_),
age = as.numeric(AGE),
race = recode(RACE, "1" = "white", "2" = "black",
.default = NA_character_),
hisp_origin = recode(HISPAN, "1" = "mexican", "2" = "puerto rican",
"3" = "cuban", "4" = "other",
.default = NA_character_),
educ = as.numeric(EDUCD) %>%
{
case_when(. < 64 ~ "HS or less",
# Includes associate's degree
. %in% 65:99 ~ "some college",
# Includes 4+ years of college
. %in% c(100:116) ~ "college")
},
region = REGION %>%
{
case_when(. %in% 11:13 ~ "Northeast",
. %in% 21:23 ~ "Midwest",
. %in% 31:34 ~ "South",
. %in% 41:43 ~ "West",
. %in% 91:99 ~ NA_character_)
}
)
] %>%
filter(age >= 18)
# Recode missing values NOW for proper quantiles
census[ ,
fincome := case_when(FTOTINC %in%
c(9999998, 9999999) ~ NA_integer_,
# net loss
# FTOTINC == -1 ~ 0,
TRUE ~ FTOTINC)]
census[ ,
`:=`(fincome_anes = cut(fincome,
labels = c("0-16ptile", "17-33ptile", "34-67ptile",
"68-95ptile", "96-100ptile"),
breaks = quantile(fincome,
probs = c(0, .16, .33, .67,
.95, 1),
na.rm = TRUE),
include.lowest = TRUE),
faminc_quin = ntile(fincome, 5) %>%
recode_factor(`1` = "1st",
`2` = "2nd",
`3` = "3rd",
`4` = "4th",
`5` = "5th"
)
),
by = .(year)
]
# Recode Hispanic from white
census[HISPAN %in% 1:4, race := "hispanic"]
Subset to B/W/H.
census <- dplyr::filter(census, race %in% c("black", "white", "hispanic"))
Copied from Will’s CCES code: age groups in 10-year bands, except under 20 is counted w/ 20s
mfloor <- function(x,base){
base * floor(x/base)
}
census[ , age_bin := mfloor(age, 10)]
## Warning in `[.data.table`(census, , `:=`(age_bin, mfloor(age, 10))): Invalid
## .internal.selfref detected and fixed by taking a (shallow) copy of the
## data.table so that := can add this new column by reference. At an earlier
## point, this data.table has been copied by R (or was created manually using
## structure() or similar). Avoid names<- and attr<- which in R currently (and
## oddly) may copy the whole data.table. Use set* syntax instead to avoid copying:
## ?set, ?setnames and ?setattr. If this message doesn't help, please report your
## use case to the data.table issue tracker so the root cause can be fixed or this
## message improved.
census[age < 20, age_bin := 20]
fips <- fastLink::statefips
setDT(fips, key = "statefips")
fips[ , south := as.numeric(state %in%
c("TN", "VA", "NC", "SC", "FL",
"GA", "AL", "MS", "LA", "AR", "TX",
# Schickler adds these
"OK", "KY"))
]
census[fips,
c("south", "state") := .(i.south, i.state),
on = "statefips"
]
sources <-
tribble( ~ SAMPLE, ~ source,
202001, "ACS",
201801, "ACS",
201601, "ACS",
201401, "ACS",
201201, "ACS",
201007, "Census",
201001, "ACS",
200801, "ACS",
200601, "ACS",
200401, "ACS",
200201, "ACS",
200001, "Census",
199001, "Census",
198001, "Census",
197002, "Census",
197001, "Census",
196002, "Census",
195001, "Census")
setDT(sources, key = "source")
census[sources,
source := i.source,
on = "SAMPLE"]
keep <- grep("[a-z]", names(census), value = TRUE)
census_out <- dplyr::select(census, all_of(keep))
setDT(census_out)
skimr::skim(census_out)
| Name | census_out |
| Number of rows | 104064467 |
| Number of columns | 17 |
| Key | NULL |
| _______________________ | |
| Column type frequency: | |
| character | 9 |
| factor | 3 |
| numeric | 5 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| year | 0 | 1.00 | 4 | 4 | 0 | 16 | 0 |
| statefips | 0 | 1.00 | 2 | 2 | 0 | 51 | 0 |
| urban | 48085787 | 0.54 | 5 | 8 | 0 | 3 | 0 |
| gender | 0 | 1.00 | 4 | 6 | 0 | 2 | 0 |
| race | 0 | 1.00 | 5 | 8 | 0 | 3 | 0 |
| hisp_origin | 91474474 | 0.12 | 5 | 12 | 0 | 4 | 0 |
| educ | 29142747 | 0.72 | 7 | 12 | 0 | 3 | 0 |
| region | 0 | 1.00 | 4 | 9 | 0 | 4 | 0 |
| source | 0 | 1.00 | 3 | 6 | 0 | 2 | 0 |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| fincome_anes | 32281552 | 0.69 | FALSE | 5 | 34-: 24453626, 68-: 19913998, 17-: 12391111, 0-1: 11508151 |
| faminc_quin | 32281552 | 0.69 | FALSE | 5 | 3rd: 14450935, 2nd: 14446234, 4th: 14381098, 1st: 14347783 |
| state | 0 | 1.00 | FALSE | 51 | CA: 10754176, TX: 7664636, NY: 7222638, FL: 5632367 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| weight | 0 | 1.00 | 0.44 | 0.68 | 0 | 0.1 | 0.2 | 0.48 | 4.044e+01 | ▇▁▁▁▁ |
| age | 0 | 1.00 | 36.60 | 22.98 | 0 | 17.0 | 35.0 | 54.00 | 1.000e+02 | ▇▇▆▃▁ |
| fincome | 32281552 | 0.69 | 48278.34 | 64558.81 | -39996 | 10925.0 | 29215.0 | 61500.00 | 3.164e+06 | ▇▁▁▁▁ |
| age_bin | 0 | 1.00 | 36.51 | 18.27 | 20 | 20.0 | 30.0 | 50.00 | 1.000e+02 | ▇▃▂▁▁ |
| south | 0 | 1.00 | 0.33 | 0.47 | 0 | 0.0 | 0.0 | 1.00 | 1.000e+00 | ▇▁▁▁▃ |
fst::write_fst(census_out, "data/census_clean.fst")
# Sample to work with locally
fwrite(census_out %>% group_by(year) %>%
sample_frac(0.001),
"data/ipums-census/census_sample.csv")